In [1]:
import sqlite3
# Open the local SQLite database file and create the cursor `c` that the
# query cells further down execute their SELECT statements on.
conn = sqlite3.connect('pmcv1-full.db')
c = conn.cursor()
available tables
SQL tables:
In [1]:
import graph_tool as gt
In [2]:
import cPickle as pickle
In [4]:
# Load the pickled citation graph and the PMID -> vertex-index lookup.
# Each file is opened in a `with` block so the handle is closed after reading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("full_graph.p", "rb") as graph_file:
    g = pickle.load(graph_file)
with open("full_graph_pmid_vertex_dict.p", "rb") as dict_file:
    pmid_vertex_dict = pickle.load(dict_file)
In [5]:
#plots
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
In [6]:
# Histogram of vertex in-degrees with log-scaled counts;
# bin edges come from np.arange(0, 1000, 20).
indegree = [vertex.in_degree() for vertex in g.vertices()]
plt.hist(indegree, np.arange(0,1000,20), log = True)
Out[6]:
In [7]:
# Histogram of vertex out-degrees with log-scaled counts;
# bin edges come from np.arange(1, 500, 10).
outdegree = [vertex.out_degree() for vertex in g.vertices()]
plt.hist(outdegree, np.arange(1,500,10), log = True)
Out[7]:
In [8]:
# In-degree distribution of g taken from graph-tool's vertex histogram,
# drawn as points on a log-scaled y axis.
import graph_tool.stats as gtstats
in_hist = gtstats.vertex_hist(g, "in")
y = in_hist[0]
plt.figure(figsize=(10,6))
plt.errorbar(in_hist[1][:-1], y, fmt="o", label="in")
plt.yscale("log")
plt.xlim(1, 1e3)
plt.ylim(1.5)
plt.xlabel("$k_{in}$")
plt.ylabel("$NP(k_{in})$")
plt.tight_layout()
In [11]:
# Web version of the in-degree distribution: smaller figure, larger fonts,
# semi-transparent markers, written to incites.svg.
import graph_tool.stats as gtstats
import matplotlib
in_hist = gtstats.vertex_hist(g, "in")
for tick_axis in ('xtick', 'ytick'):
    matplotlib.rc(tick_axis, labelsize=14)
y = in_hist[0]
plt.figure(figsize=(6,4))
plt.errorbar(in_hist[1][:-1], y, fmt="o", label="in", alpha=0.5)
plt.yscale("log")
plt.xlim(1, 1e3)
plt.ylim(1.5)
plt.xlabel("Number of in-citations", fontsize=16)
plt.ylabel("Counts", fontsize=16)
plt.tight_layout()
plt.savefig('incites.svg')
In [7]:
# Let's plot its out-degree distribution
import graph_tool.stats as gtstats
# The name in_hist is kept from the in-degree cell, but here it holds the
# out-degree histogram (vertex_hist is called with "out").
in_hist = gtstats.vertex_hist(g, "out")
y = in_hist[0]
plt.figure(figsize=(10,6))
plt.errorbar(in_hist[1][:-1], in_hist[0], fmt="o", label="out")
plt.gca().set_yscale("log")
plt.gca().set_xlim(1, 600)
plt.gca().set_ylim(1.5,3e4)
# Axis labels now name the out-degree that is actually plotted.
plt.xlabel("$k_{out}$")
plt.ylabel("$NP(k_{out})$")
plt.tight_layout()
In [12]:
#to save for web
# Let's plot its out-degree distribution
import graph_tool.stats as gtstats
# NOTE: despite its name, in_hist holds the out-degree histogram here.
in_hist = gtstats.vertex_hist(g, "out")
import matplotlib
matplotlib.rc('xtick', labelsize=14)
matplotlib.rc('ytick', labelsize=14)
y = in_hist[0]
plt.figure(figsize=(6,4))
plt.errorbar(in_hist[1][:-1], in_hist[0], fmt="o", label="out", alpha = 0.5)
plt.gca().set_yscale("log")
plt.gca().set_xlim(1, 600)
plt.gca().set_ylim(1.5,3e4)
plt.xlabel("Number of out-citations", fontsize=16)
plt.ylabel("Counts", fontsize=16)
plt.tight_layout()
# Write the figure to outcites.svg.
plt.savefig('outcites.svg')
In [28]:
g
Out[28]:
In [59]:
# Fetch the refpmid rows recorded for the root paper and show them.
rootpmid = 26247944
output = c.execute('''SELECT refpmid FROM refs WHERE pmid = ?''', (rootpmid,)).fetchall()
print(output)
In [51]:
# For every first-level reference row, print the refpmid rows it has in turn.
for pmid in output:
    print(c.execute('''SELECT refpmid FROM refs WHERE pmid = ?''', (pmid[0],)).fetchall())
In [60]:
def addedge(graphobject, source, dest, vertexdict):
    """Add the edge source -> dest to graphobject, creating missing vertices.

    vertexdict maps caller-side keys to vertex indices in graphobject; a key
    seen for the first time gets a new vertex and is recorded in the dict.
    Returns the (graphobject, vertexdict) pair, both updated in place.
    """
    for key in (source, dest):
        if key not in vertexdict:
            vertexdict[key] = int(graphobject.add_vertex())
    graphobject.add_edge(vertexdict[source], vertexdict[dest])
    return graphobject, vertexdict
In [65]:
# Build a two-level reference graph around rootpmid straight from the refs
# table: root -> its references -> their references.
minigraph = gt.Graph()
vertexdict = dict()
rootpmid = 26247944
children = c.execute('''SELECT refpmid FROM refs WHERE pmid = ?''', [rootpmid]).fetchall()
for child in children:
    minigraph, vertexdict = addedge(minigraph, rootpmid, child[0], vertexdict)
    secondchildren = c.execute('''SELECT refpmid FROM refs WHERE pmid = ?''', [child[0]]).fetchall()
    for child2 in secondchildren:
        minigraph, vertexdict = addedge(minigraph, child[0], child2[0], vertexdict)
In [78]:
minigraph
Out[78]:
In [212]:
import graph_tool.all as gt
import math
In [213]:
# Draw minigraph with each vertex filled according to its in-degree.
deg = minigraph.degree_property_map("in")
#deg.a = 4 * (math.sqrt(deg.a) * 0.5 + 0.4)
gt.graph_draw(minigraph, vertex_fill_color=deg)
Out[213]:
In [ ]:
In [ ]:
# CO-AUTHORSHIP NETWORK???
# https://graph-tool.skewed.de/static/doc/draw.html
In [ ]:
In [150]:
# Inverse lookup: vertex index -> PMID.
rev_pmid_vertex_dict = dict((vertex, pmid) for pmid, vertex in pmid_vertex_dict.items())
In [202]:
def addedge(graphobject, source, dest, vertexdict):
    """Insert the edge source -> dest, adding either endpoint if it is new.

    New endpoints are created with graphobject.add_vertex() and their integer
    index stored under the endpoint key in vertexdict.
    Returns (graphobject, vertexdict).
    """
    for endpoint in (source, dest):
        if endpoint not in vertexdict:
            vertexdict[endpoint] = int(graphobject.add_vertex())
    graphobject.add_edge(vertexdict[source], vertexdict[dest])
    return graphobject, vertexdict
In [258]:
# Two-level out-neighbourhood of the root paper, read from the in-memory graph g.
# The root is keyed by its PMID while every other node is keyed by the vertex
# object yielded by out_neighbours().
rootpmid = 26247944
minigraph2 = gt.Graph()
vertexdict2 = dict()
root_vertex = g.vertex(pmid_vertex_dict[rootpmid])
for neigh in root_vertex.out_neighbours():
    minigraph2, vertexdict2 = addedge(minigraph2, rootpmid, neigh, vertexdict2)
    try:
        second_level = g.vertex(neigh).out_neighbours()
        for neigh2 in second_level:
            minigraph2, vertexdict2 = addedge(minigraph2, neigh, neigh2, vertexdict2)
    except KeyError:
        print("{} not in graph g".format(neigh))
In [259]:
# Draw minigraph2 with each vertex filled according to its in-degree.
deg = minigraph2.degree_property_map("in")
gt.graph_draw(minigraph2, vertex_fill_color=deg)
Out[259]:
In [273]:
# Render minigraph2 through graphviz to an SVG file and show that file inline.
# Vertices are coloured/ordered by a rescaled in-degree, edges by betweenness.
from IPython.display import SVG, display

svg_path = "/tmp/3961631087.svg"
deg = minigraph2.degree_property_map("in")
deg.a = 2 * (np.sqrt(deg.a) * 0.5 + 0.4)
ebet = gt.betweenness(minigraph2)[1]
gt.graphviz_draw(
    minigraph2,
    vcolor=deg, vorder=deg,
    elen=10,
    ecolor=ebet, eorder=ebet,
    output = svg_path, output_format="svg",
    size = (25,25),
)
display(SVG(filename = svg_path))
In [214]:
# Lay out minigraph2 with graph-tool's ARF layout and draw it.
# The layout is called through `gt` (graph_tool.all, as `gt.sfdp_layout` is
# elsewhere in this notebook); no `draw` name is ever imported here.
pos = gt.arf_layout(minigraph2)
gt.graph_draw(minigraph2, pos=pos)
Out[214]:
In [215]:
# Lay out minigraph2 with the SFDP layout (cooling_step=0.95) and draw it.
# Called through `gt` because no `draw` name is imported in this notebook.
pos = gt.sfdp_layout(minigraph2, cooling_step=0.95)
gt.graph_draw(minigraph2, pos=pos)
Out[215]:
In [196]:
# ARF layout of minigraph2, with vertices filled according to in-degree.
# Called through `gt` because no `draw` name is imported in this notebook.
pos = gt.arf_layout(minigraph2)
deg = minigraph2.degree_property_map("in")
gt.graph_draw(minigraph2, pos=pos, vertex_fill_color=deg)
Out[196]:
In [219]:
# Fit a nested block model (deg_corr=True) to minigraph2 and draw the
# resulting hierarchy.
state = gt.minimize_nested_blockmodel_dl(minigraph2, deg_corr=True)
gt.draw_hierarchy(state)
Out[219]:
In [217]:
# SFDP layout (cooling_step=0.99) drawn with small vertices filled by their index.
# The layout is called through `gt` because no `draw` name is imported in this notebook.
gt.graph_draw(minigraph2, pos=gt.sfdp_layout(minigraph2, cooling_step=0.99),
              vertex_fill_color=minigraph2.vertex_index, vertex_size=2,
              edge_pen_width=1)
Out[217]:
In [238]:
# LOOK INTO CO-AUTHORSHIP NETWORKS
# https://graph-tool.skewed.de/static/doc/draw.html
# http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2721762/
# NOTE(review): this rebinds `g`, so from here on `g` is minigraph2 and no
# longer the full graph loaded from full_graph.p; cells that expect the full
# graph need it reloaded first.
g = minigraph2
g.purge_vertices()
# Fit the nested block model and take the first element of the hierarchy-tree result.
state = gt.minimize_nested_blockmodel_dl(g, deg_corr=True)
t = gt.get_hierarchy_tree(state)[0]
# Radial layout of the hierarchy tree, rooted at its last vertex.
tpos = pos = gt.radial_tree_layout(t, t.vertex(t.num_vertices() - 1), weighted=True)
cts = gt.get_hierarchy_control_points(g, t, tpos)
pos = g.own_property(tpos)
# Block membership at the lowest level; also used (modulo 14) as the vertex shape.
b = state.levels[0].b
shape = b.copy()
shape.a %= 14
gt.graph_draw(g, pos=pos, vertex_fill_color=b, vertex_shape=shape, edge_control_points=cts,
edge_color=[0, 0, 0, 0.3], vertex_anchor=0)
Out[238]:
In [242]:
# SFDP drawing of minigraph2: vertex size and colour follow a rescaled
# in-degree, edge colour and width follow edge betweenness.
deg = minigraph2.degree_property_map("in")
deg.a = 4 * (np.sqrt(deg.a) * 0.5 + 0.4)
ebet = gt.betweenness(minigraph2)[1]
ebet.a /= ebet.a.max() / 10.
# Negated copy of the betweenness, passed to graph_draw as the edge order.
eorder = ebet.copy()
eorder.a *= -1
pos = gt.sfdp_layout(minigraph2)
# Create and fill the control points on minigraph2 itself, so this cell no
# longer depends on `g` happening to be bound to the same graph.
control = minigraph2.new_edge_property("vector<double>")
for e in minigraph2.edges():
    d = np.sqrt(sum((pos[e.source()].a - pos[e.target()].a) ** 2)) / 5
    control[e] = [0.3, d, 0.7, d]
gt.graph_draw(minigraph2, pos=pos, vertex_size=deg, vertex_fill_color=deg, vorder=deg,
              edge_color=ebet, eorder=eorder, edge_pen_width=ebet,
              edge_control_points=control)
# Adapted from the graph-tool drawing docs, whose caption for the original example reads:
# "SFDP force-directed layout of a Price network with 1500 nodes. The vertex size and color
# indicate the degree, and the edge color and width the edge betweenness centrality."
# https://graph-tool.skewed.de/static/doc/draw.html
Out[242]:
In [218]:
# SFDP layout (cooling_step=0.99) drawn with small vertices filled by their index.
# The layout is called through `gt` because no `draw` name is imported in this notebook.
gt.graph_draw(minigraph2, pos=gt.sfdp_layout(minigraph2, cooling_step=0.99),
              vertex_fill_color=minigraph2.vertex_index, vertex_size=2,
              edge_pen_width=1)
Out[218]:
In [ ]:
It will make sense to trim down the dataset to papers that have interesting citation networks within PMC, since many papers do not! This is a reasonable excuse for doing so, and it will also shrink the data to make it more manageable for dynamic plotting on the webserver. Can I make a statement about the likelihood of being in PMC open access by field? By keyword?
Rather than using the graph network for the dynamic portion of the website, to save resources, I can precompute in-neighbors and out-neighbors, save them into a SQL table, and just query these. Also, in terms of excluding papers that aren't interesting (not in PMC) - rather than doing this, perhaps just don't have them autocomplete in the search window and/or de-rank them in results? Could also color them differently in the graph networks (e.g. red) to indicate that the network terminates there, without specifying why (not in PMC vs too new)?
In [ ]:
In [245]:
# Three-level out-neighbourhood of the root paper, read from the in-memory graph g.
rootpmid = 26247944
minigraph2 = gt.Graph()
vertexdict2 = dict()
for neigh in g.vertex(pmid_vertex_dict[rootpmid]).out_neighbours():
    minigraph2, vertexdict2 = addedge(minigraph2, rootpmid, neigh, vertexdict2)
    try:
        for neigh2 in g.vertex(neigh).out_neighbours():
            minigraph2, vertexdict2 = addedge(minigraph2, neigh, neigh2, vertexdict2)
            try:
                for neigh3 in g.vertex(neigh2).out_neighbours():
                    minigraph2, vertexdict2 = addedge(minigraph2, neigh2, neigh3, vertexdict2)
            except KeyError:
                # Report the node whose lookup failed (neigh2), not its parent.
                print("3rd degree node {} not in graph g".format(neigh2))
    except KeyError:
        print("2nd degree node {} not in graph g".format(neigh))
In [247]:
# Draw the rebuilt minigraph2 with each vertex filled according to its in-degree.
deg = minigraph2.degree_property_map("in")
gt.graph_draw(minigraph2, vertex_fill_color=deg)
Out[247]:
In [402]:
def addedge(graphobject, source, dest, vertexdict):
    """Add the edge source -> dest, registering unseen endpoints first.

    An endpoint missing from vertexdict gets a fresh vertex in graphobject and
    its integer index is stored in vertexdict under that endpoint.
    Returns (graphobject, vertexdict), both modified in place.
    """
    for endpoint in (source, dest):
        if endpoint not in vertexdict:
            vertexdict[endpoint] = int(graphobject.add_vertex())
    src_index = vertexdict[source]
    dst_index = vertexdict[dest]
    graphobject.add_edge(src_index, dst_index)
    return graphobject, vertexdict
#def addlayer(graphobject, rootnode, vertexdict, direction = 'out'):
# if direction == 'out':
# for neigh in graphobject.vertex(rootnode).out_neighbors():
# graphobject, vertexdict = addedge(graphobject, rootnode, neigh, vertexdict)
# elif direction == 'in':
# for neigh in graphobject.vertex(rootnode).in_neighbors():
# graphobject, vertexdict = addedge(graphobject, rootnode, neigh, vertexdict)
# else:
# print "Specify direction as either in or out"
import Queue
import graph_tool.all as gt
def buildlocalgraph(rootnode, mastergraph, indepth = 0, outdepth = 2):
    """Copy the neighbourhood of rootnode in mastergraph into a new directed graph.

    A breadth-first walk follows out-edges for up to `outdepth` hops, then
    in-edges for up to `indepth` hops, copying every traversed edge with
    addedge(). This single definition replaces an earlier unfinished draft of
    the same name that was immediately shadowed in the same cell.

    Parameters:
        rootnode: vertex of mastergraph to start from (the calling cells pass
            an integer vertex index).
        mastergraph: graph whose neighbours are read.
        indepth: number of hops to follow along in-edges.
        outdepth: number of hops to follow along out-edges.

    Returns:
        (graph, vertexdict): the new graph and the mapping from master-graph
        nodes to vertex indices in the new graph.
    """
    _g = gt.Graph()
    _vertexdict = dict()
    for maxdepth, outgoing in ((outdepth, True), (indepth, False)):
        q = Queue.Queue()
        q.put((rootnode, 0))
        # A node reachable along several paths is expanded only once (at its
        # smallest depth, since the queue is FIFO); re-expanding it would add
        # the same edges again and make deep walks blow up.
        expanded = set()
        while not q.empty():
            node, depth = q.get()
            if depth >= maxdepth or int(node) in expanded:
                continue
            expanded.add(int(node))
            try:
                vertex = mastergraph.vertex(node)
                neighbours = vertex.out_neighbours() if outgoing else vertex.in_neighbours()
                for neigh in neighbours:
                    if outgoing:
                        _g, _vertexdict = addedge(_g, node, neigh, _vertexdict)
                    else:
                        _g, _vertexdict = addedge(_g, neigh, node, _vertexdict)
                    q.put((neigh, depth + 1))
            except KeyError:
                # NOTE(review): confirm KeyError is what mastergraph.vertex()
                # raises for an unknown node. The message is now printed
                # instead of being built and discarded.
                print("{} degree node {} not in graph g".format(depth, node))
    return _g, _vertexdict
In [343]:
rootpmid = 26247944
minigraph3, vertexdict3 = buildlocalgraph(pmid_vertex_dict[rootpmid], g, 2, 0)
In [383]:
def countoutneigh(g, node):
    """Return how many items g.vertex(node).out_neighbours() yields."""
    return sum(1 for _neigh in g.vertex(node).out_neighbours())
def countinneigh(g, node):
    """Return how many items g.vertex(node).in_neighbours() yields."""
    return sum(1 for _neigh in g.vertex(node).in_neighbours())
In [384]:
countoutneigh(g, 5690856)
Out[384]:
In [389]:
# For vertex 5690856, list each out-neighbour that itself has out-neighbours,
# preceded by how many it has.
for neigh in g.vertex(5690856).out_neighbours():
    n_out = countoutneigh(g, neigh)
    if n_out > 0:
        print("{} {}".format(n_out, neigh))
In [393]:
countinneigh(g, 2399401)
Out[393]:
In [398]:
# In-neighbour counts for vertices 2399400 and 2399401.
for i in (2399400, 2399401):
    print(countinneigh(g, i))
In [439]:
minigraph3, vertexdict3 = buildlocalgraph(2399401, g, 10, 10)
In [440]:
# Draw minigraph3 at 1200x1200 with vertices filled according to in-degree.
deg = minigraph3.degree_property_map("in")
gt.graph_draw(minigraph3, vertex_fill_color=deg, output_size=(1200,1200))
Out[440]:
In [418]:
# Radial drawing of minigraph3 laid out on its nested block-model hierarchy.
minigraph3.purge_vertices()
state = gt.minimize_nested_blockmodel_dl(minigraph3, deg_corr=True)
t = gt.get_hierarchy_tree(state)[0]
# Radial layout of the hierarchy tree, rooted at its last vertex.
tpos = pos = gt.radial_tree_layout(t, t.vertex(t.num_vertices() - 1), weighted=True)
cts = gt.get_hierarchy_control_points(minigraph3, t, tpos)
pos = minigraph3.own_property(tpos)
# Block membership at the lowest level; also used (modulo 14) as the vertex shape.
b = state.levels[0].b
shape = b.copy()
shape.a %= 14
gt.graph_draw(minigraph3, pos=pos, vertex_fill_color=b, vertex_shape=shape, edge_control_points=cts,
edge_color=[0, 0, 0, 0.3], vertex_anchor=0)
Out[418]:
In [421]:
minigraph3, vertexdict3 = buildlocalgraph(2399401, g, 10, 10)
state = gt.minimize_nested_blockmodel_dl(minigraph3, deg_corr=True)
gt.draw_hierarchy(state)
Out[421]:
In [451]:
import math
# Radial block-model drawing of minigraph3, adapted from the polblogs example at
# http://stackoverflow.com/questions/238724/visualizing-undirected-graph-thats-too-large-for-graphviz
# NOTE(review): this rebinds `g`; the full graph must be reloaded afterwards
# by any cell that expects it.
g = minigraph3
print(g.num_vertices(), g.num_edges())
#reduce to only connected nodes
g = gt.GraphView(g, vfilt=lambda v: (v.out_degree() > 0) and (v.in_degree() > 0))
g.purge_vertices()
print(g.num_vertices(), g.num_edges())

# Vertex colours. The colour-assignment loop below is still commented out, so
# plot_color is never filled in.
#use 1->Republican, 2->Democrat
red_blue_map = {1:(1,0,0,1),0:(0,0,1,1)}
plot_color = g.new_vertex_property('vector<double>')
g.vertex_properties['plot_color'] = plot_color
#for v in g.vertices():
#    plot_color[v] = red_blue_map[g.vertex_properties['value'][v]] ######### USE THIS FOR GENDER??
#### THIS WAS USED FOR RED/BLUE POLITICAL AFIL
### SEE: http://stackoverflow.com/questions/238724/visualizing-undirected-graph-thats-too-large-for-graphviz

#edge colors
alpha=0.15
edge_color = g.new_edge_property('vector<double>')
g.edge_properties['edge_color']=edge_color
for e in g.edges():
    if plot_color[e.source()] != plot_color[e.target()]:
        if plot_color[e.source()] == (0,0,1,1):
            #orange on dem -> rep
            edge_color[e] = (255.0/255.0, 102/255.0, 0/255.0, alpha)
        else:
            edge_color[e] = (102.0/255.0, 51/255.0, 153/255.0, alpha)
    #red on rep-rep edges
    elif plot_color[e.source()] == (1,0,0,1):
        edge_color[e] = (1,0,0, alpha)
    #blue on dem-dem edges
    else:
        edge_color[e] = (0,0,1, alpha)

state = gt.minimize_nested_blockmodel_dl(g, deg_corr=True)
bstack = state.get_bstack()
# Build the hierarchy tree from the state fitted just above. Previously `t`
# was whatever an earlier cell had left behind, i.e. a tree for another graph.
t = gt.get_hierarchy_tree(state)[0]
tpos = pos = gt.radial_tree_layout(t, t.vertex(t.num_vertices() - 1), weighted=True)
cts = gt.get_hierarchy_control_points(g, t, tpos)
pos = g.own_property(tpos)
b = bstack[0].vp["b"]

#labels
text_rot = g.new_vertex_property('double')
g.vertex_properties['text_rot'] = text_rot
for v in g.vertices():
    x, y = pos[v][0], pos[v][1]
    if x > 0:
        text_rot[v] = math.atan(y / x)
    elif x < 0:
        text_rot[v] = math.pi + math.atan(y / x)
    else:
        # x == 0 would divide by zero; use the limiting angle instead.
        text_rot[v] = math.pi / 2 if y >= 0 else 3 * math.pi / 2

gt.graph_draw(g, pos=pos, vertex_fill_color=g.vertex_properties['plot_color'],
              vertex_color=g.vertex_properties['plot_color'],
              edge_control_points=cts,
              vertex_size=10,
              #vertex_text=g.vertex_properties['label'],
              vertex_text_rotation=g.vertex_properties['text_rot'],
              vertex_text_position=1,
              vertex_font_size=9,
              edge_color=g.edge_properties['edge_color'],
              vertex_anchor=0,
              #bg_color=[0,0,0,1],
              output_size=[4024,4024])
              #output='polblogs_blockmodel.png')
Out[451]:
In [452]:
import cPickle as pickle
# Reload the full graph and its PMID lookups (earlier cells rebound `g`).
# Files are opened in `with` blocks so the handles are closed after reading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("full_graph.p", "rb") as graph_file:
    g = pickle.load(graph_file)
with open("full_graph_pmid_vertex_dict.p", "rb") as dict_file:
    pmid_vertex_dict = pickle.load(dict_file)
# Inverse lookup: vertex index -> PMID.
rev_pmid_vertex_dict = {v: k for k, v in pmid_vertex_dict.items()}
In [461]:
# Scan vertex indices 0..4999 for nodes with more than 30 neighbours in both
# directions and print each one's index and PMID.
for node in range(5000):
    inn = countinneigh(g,node)
    outn = countoutneigh(g, node)
    if inn > 30 and outn > 30:
        print("{} {}".format(node, rev_pmid_vertex_dict[node]))
In [473]:
# Local graph around PMID 20502679 (indepth=2, outdepth=5), drawn with
# vertices filled according to out-degree.
rootpmid = 20502679
minigraph3, vertexdict3 = buildlocalgraph(pmid_vertex_dict[rootpmid], g, 2, 5)
deg = minigraph3.degree_property_map("out") #out AND in how?
gt.graph_draw(minigraph3, vertex_fill_color=deg, output_size=(1200,1200))
Out[473]:
In [477]:
# Rebuild the local graph (indepth=5, outdepth=5) and draw it radially on its
# nested block-model hierarchy.
minigraph3, vertexdict3 = buildlocalgraph(pmid_vertex_dict[rootpmid], g, 5, 5)
minigraph3.purge_vertices()
state = gt.minimize_nested_blockmodel_dl(minigraph3, deg_corr=True)
t = gt.get_hierarchy_tree(state)[0]
# Radial layout of the hierarchy tree, rooted at its last vertex.
tpos = pos = gt.radial_tree_layout(t, t.vertex(t.num_vertices() - 1), weighted=True)
cts = gt.get_hierarchy_control_points(minigraph3, t, tpos)
pos = minigraph3.own_property(tpos)
# Block membership at the lowest level; also used (modulo 14) as the vertex shape.
b = state.levels[0].b
shape = b.copy()
shape.a %= 14
gt.graph_draw(minigraph3, pos=pos, vertex_fill_color=b, vertex_shape=shape, edge_control_points=cts,
edge_color=[0, 0, 0, 0.3], vertex_anchor=0)
Out[477]:
In [476]:
minigraph3, vertexdict3 = buildlocalgraph(pmid_vertex_dict[rootpmid], g, 5, 5)
state = gt.minimize_nested_blockmodel_dl(minigraph3, deg_corr=True)
gt.draw_hierarchy(state)
Out[476]:
labels: http://stackoverflow.com/questions/24221792/graph-tool-draw-graph-with-user-defined-vertex-text
In [2]:
def addedge(graphobject, source, dest, vertexdict, v_label):
    """Add the edge source -> dest, creating and labelling missing vertices.

    Each newly created vertex is labelled with str(rev_pmid_vertex_dict[key])
    for its OWN key. The previous version labelled a new source vertex with
    the dest's entry.

    Parameters:
        graphobject: graph being built (needs add_vertex / add_edge).
        source, dest: endpoint keys, also used to index the notebook-level
            rev_pmid_vertex_dict.
        vertexdict: maps endpoint keys to vertex indices in graphobject.
        v_label: per-vertex label map, indexed by the new vertex.

    Returns:
        (graphobject, vertexdict, v_label), all updated in place.
    """
    for key in (source, dest):
        if key not in vertexdict:
            v = graphobject.add_vertex()
            vertexdict[key] = int(v)
            v_label[v] = str(rev_pmid_vertex_dict[key])
    graphobject.add_edge(vertexdict[source], vertexdict[dest])
    return graphobject, vertexdict, v_label
import Queue
import graph_tool.all as gt
def buildlocalgraph(rootnode, mastergraph, indepth = 0, outdepth = 2):
    """Copy the neighbourhood of rootnode into a new directed, labelled graph.

    A breadth-first walk follows out-edges for up to `outdepth` hops, then
    in-edges for up to `indepth` hops, copying every traversed edge with the
    labelling addedge().

    Parameters:
        rootnode: vertex of mastergraph to start from (the calling cells pass
            an integer vertex index).
        mastergraph: graph whose neighbours are read.
        indepth: number of hops to follow along in-edges.
        outdepth: number of hops to follow along out-edges.

    Returns:
        (graph, vertexdict, v_label): the new graph, the mapping from
        master-graph nodes to its vertex indices, and the string vertex
        property holding the labels.
    """
    _g = gt.Graph()
    _vertexdict = dict()
    v_label = _g.new_vertex_property("string")
    for maxdepth, outgoing in ((outdepth, True), (indepth, False)):
        q = Queue.Queue()
        q.put((rootnode, 0))
        # Expand each node once per direction (at its smallest depth, since
        # the queue is FIFO) so shared ancestors do not get their edges
        # copied repeatedly.
        expanded = set()
        while not q.empty():
            node, depth = q.get()
            if depth >= maxdepth or int(node) in expanded:
                continue
            expanded.add(int(node))
            try:
                vertex = mastergraph.vertex(node)
                neighbours = vertex.out_neighbours() if outgoing else vertex.in_neighbours()
                for neigh in neighbours:
                    if outgoing:
                        _g, _vertexdict, v_label = addedge(_g, node, neigh, _vertexdict, v_label)
                    else:
                        _g, _vertexdict, v_label = addedge(_g, neigh, node, _vertexdict, v_label)
                    q.put((neigh, depth + 1))
            except KeyError:
                # Both directions now print the message; the in-direction
                # branch used to build the string and discard it.
                print("{} degree node {} not in graph g".format(depth, node))
    return _g, _vertexdict, v_label
In [542]:
# Labelled local graph around PMID 20502679 (indepth=0, outdepth=2), drawn
# with the v_label text on each vertex.
rootpmid = 20502679
#rev_pmid_vertex_dict = {v: k for k, v in pmid_vertex_dict.items()}
minigraph3, vertexdict3, v_label = buildlocalgraph(pmid_vertex_dict[rootpmid], g, 0, 2)
deg = minigraph3.degree_property_map("out") #out AND in how?
gt.graph_draw(minigraph3, vertex_fill_color=deg, vertex_text=v_label, output_size=(1200,1200))
Out[542]:
In [544]:
rootpmid = 20502679
#rev_pmid_vertex_dict = {v: k for k, v in pmid_vertex_dict.items()}
minigraph3, vertexdict3, v_label = buildlocalgraph(pmid_vertex_dict[rootpmid], g, 2, 2)
deg = minigraph3.degree_property_map("out") #out AND in how?
gt.graph_draw(minigraph3, vertex_fill_color=deg, output_size=(1200,1200))
Out[544]:
In [587]:
import collections
import nltk
import string
import nltk.stem.porter
def stem_tokens(tokens, stemmer):
    """Return a list holding stemmer.stem(token) for every token, in order."""
    return [stemmer.stem(token) for token in tokens]
# Tokenise, lower-case and stem `teststring`, then print the 100 most common stems.
# NOTE(review): `teststring` is not defined in any cell shown in this notebook.
stemmer = nltk.stem.porter.PorterStemmer()
tokens = list(token.lower() for token in nltk.word_tokenize(teststring))
stemmed = stem_tokens(tokens, stemmer)
count = collections.Counter(stemmed)
print(count.most_common(100))
In [611]:
# Abstracts whose PMID lies within 100 of rootpmid, as a list of text strings.
rootpmid = 20502679
c.execute('''SELECT abstract FROM abstracts WHERE pmid > ? AND pmid < ?''', [rootpmid-100, rootpmid+100])
# Each fetched row is a 1-tuple; keep the abstract itself. The earlier
# str(row) stored the tuple's repr, e.g. "(u'...',)", as the document text.
corpus = [unicode(row[0]) for row in c.fetchall()]
print(len(corpus))
In [612]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectoriser over word 1- to 3-grams (English stop words removed)
# on the corpus, then show how many features it produced.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')
tfidf_matrix = tf.fit_transform(corpus)
feature_names = tf.get_feature_names()
len(feature_names)
Out[612]:
In [613]:
tfidf_matrix
Out[613]:
In [615]:
# NOTE(review): `episode` is taken from `corpus` (the raw documents) rather
# than from tfidf_matrix, so the pairs below are (index, element of the
# document), not TF-IDF scores. The next cell reads a row of the matrix instead.
episode = corpus[0]
phrase_scores = [pair for pair in zip(range(0, len(episode)), episode) if pair[1] > 0]
sorted(phrase_scores, key=lambda t: t[1] * -1)[:5]
Out[615]:
In [628]:
# Print the five highest-scoring TF-IDF features of document 1.
# (The result is named top10 although it holds five entries.)
dense = tfidf_matrix.todense()
episode = dense[1].tolist()[0]
phrase_scores = [(idx, score) for idx, score in enumerate(episode) if score > 0]
top10 = sorted(phrase_scores, key=lambda t: -t[1])[:5]
for entry in top10:
    print(feature_names[entry[0]])
In [634]:
# Load every (pmid, abstract) row, then split the rows into two parallel lists:
# corpuspmid with the PMIDs and corpus with the abstracts as unicode text.
c.execute('''SELECT pmid, abstract FROM abstracts''')
corpus = c.fetchall()
print(len(corpus))
corpuspmid = [row[0] for row in corpus]
corpus[:] = [unicode(row[1]) for row in corpus]
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')
tfidf_matrix = tf.fit_transform(corpus)
feature_names = tf.get_feature_names()
len(feature_names)
In [ ]:
# NOTE(review): `article` is assigned but never used; the comprehension below
# still reads `episode`, which is left over from an earlier cell.
article = corpus[0]
phrase_scores = [pair for pair in zip(range(0, len(episode)), episode) if pair[1] > 0]
sorted(phrase_scores, key=lambda t: t[1] * -1)[:5]
SECOND APPROACH SEE SECOND NOTEBOOK 270616 gensim TFIDF
In [ ]:
import sqlite3
# Connect to the database, load every (pmid, abstract) row, then split the rows
# into corpuspmid (the PMIDs) and corpus (the abstracts as unicode text).
conn = sqlite3.connect('pmcv1-full.db')
c = conn.cursor()
c.execute('''SELECT pmid, abstract FROM abstracts''')
corpus = c.fetchall()
print(len(corpus))
corpuspmid = [row[0] for row in corpus]
corpus[:] = [unicode(row[1]) for row in corpus]
In [ ]:
import gensim
# First 1000 documents as a small test corpus. A slice also copes with a corpus
# holding fewer than 1000 entries, where indexing 0..999 raised IndexError.
testcorpus = corpus[:1000]
In [ ]:
In [ ]:
In [ ]:
See: http://www-personal.umich.edu/~mejn/papers/cnlspre.pdf Initially, what I want to do is
In [1]:
import sqlite3
conn = sqlite3.connect('pmcv1-full.db')
c = conn.cursor()
In [2]:
# Author rows (pmid, fn, ln) for papers whose PMID lies within 100 of rootpmid.
rootpmid = 20502679
authors = c.execute('''SELECT pmid, fn, ln FROM authors WHERE pmid > ? AND pmid < ?''', (rootpmid-100, rootpmid+100)).fetchall()
print(len(authors))
In [11]:
#fullnames:
authors[i][1]+authors[i][2]
Out[11]:
In [3]:
#import graph_tool.all as gt
import graph_tool as gt
In [5]:
import hashlib
def md5hash(string):
    """Return the hexadecimal MD5 digest of `string`.

    Byte strings are hashed unchanged. Text (unicode) input is encoded as
    UTF-8 first, so non-ASCII text hashes instead of raising. For ASCII text
    the digest is the same as for the equivalent byte string.
    """
    if not isinstance(string, bytes):
        string = string.encode("utf-8")
    return hashlib.md5(string).hexdigest()
def addedge(graphobject, source, dest, vertexdict):
    """Connect source and dest in graphobject, creating unseen vertices.

    vertexdict maps endpoint keys to vertex indices; an endpoint seen for the
    first time gets a new vertex and an entry in the dict.
    Returns (graphobject, vertexdict), both updated in place.
    """
    for name in (source, dest):
        if name not in vertexdict:
            vertexdict[name] = int(graphobject.add_vertex())
    graphobject.add_edge(vertexdict[source], vertexdict[dest])
    return graphobject, vertexdict
In [6]:
import itertools
# Build an undirected co-authorship graph for papers near rootpmid: one edge
# per pair of authors on the same paper.
g = gt.Graph(directed = False)
author_vertex_dict = dict()
author_full_name_dict = dict()
c.execute('''SELECT pmid, fn, ln FROM authors WHERE pmid > ? AND pmid < ?''', [rootpmid-100, rootpmid+100])
authors = c.fetchall()
# Group rows paper by paper. The earlier loop appended each paper's first
# author to the PREVIOUS paper's list before flushing it, then dropped that
# author from their own paper, and never linked the authors of the last paper.
# groupby merges only adjacent rows, so (like the earlier loop) this relies on
# a paper's rows being returned contiguously. An empty result now simply
# yields an empty graph.
for currpaper, paperrows in itertools.groupby(authors, key=lambda row: row[0]):
    authorspaper = []
    for entry in paperrows:
        # Author key: first name + last name, spaces removed, lower-cased.
        authorcat = unicode(entry[1]+entry[2]).replace(" ", "").lower()
        authorspaper.append(authorcat)
        author_full_name_dict[authorcat] = (entry[1],entry[2])
    #add author nodes and edges
    for comb in itertools.combinations(authorspaper, 2):
        addedge(g, comb[0], comb[1], author_vertex_dict)
In [7]:
g
Out[7]:
In [56]:
author_vertex_dict["serenafondaumani"]
Out[56]:
In [60]:
author_full_name_dict["serenafondaumani"]
Out[60]:
In [ ]:
#%matplotlib inline
#gt.graph_draw(g, output_size=(1200,1200), output_format="png")
In [12]:
#THIS CALCULATION IS SLOW!!!! Like 5 hours slow. SAVE THE DICTS AND THE GRAPH WHEN DONE
In [1]:
import cPickle as pickle
# Load the pickled co-authorship graph and its two author lookups.
# Files are opened in `with` blocks so the handles are closed after reading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("authors_full_graph.p", "rb") as graph_file:
    g = pickle.load(graph_file)
with open("authors_vertex_dict.p", "rb") as vertex_file:
    author_vertex_dict = pickle.load(vertex_file)
with open("authors_full_name_dict.p", "rb") as name_file:
    author_full_name_dict = pickle.load(name_file)
In future, may want to give edges weights: appearing together on one paper = 1, on two papers = 2, and so on.
In [2]:
def addedge(graphobject, source, dest, vertexdict):
    """Add an edge between source and dest, creating either vertex if unseen.

    Unseen endpoints are added with graphobject.add_vertex() and recorded in
    vertexdict as endpoint -> integer vertex index.
    Returns (graphobject, vertexdict).
    """
    for endpoint in (source, dest):
        if endpoint not in vertexdict:
            vertexdict[endpoint] = int(graphobject.add_vertex())
    graphobject.add_edge(vertexdict[source], vertexdict[dest])
    return graphobject, vertexdict
import Queue
import graph_tool.all as gt
def buildlocalgraph(rootnode, mastergraph, indepth = 0, outdepth = 2):
    """Copy the neighbourhood of rootnode in mastergraph into a new directed graph.

    The previous body in this cell was an unfinished draft: it iterated the
    neighbours of the still-empty local graph instead of mastergraph, never
    queued neighbours, ignored indepth and returned None. It now walks
    mastergraph breadth-first, out-edges for up to `outdepth` hops and then
    in-edges for up to `indepth` hops, copying every traversed edge with
    addedge().

    Returns:
        (graph, vertexdict): the new graph and the mapping from master-graph
        nodes to its vertex indices.
    """
    _g = gt.Graph()
    _vertexdict = dict()
    for maxdepth, outgoing in ((outdepth, True), (indepth, False)):
        q = Queue.Queue()
        q.put((rootnode, 0))
        # Expand each node once per direction, at its smallest depth.
        expanded = set()
        while not q.empty():
            node, depth = q.get()
            if depth >= maxdepth or int(node) in expanded:
                continue
            expanded.add(int(node))
            try:
                vertex = mastergraph.vertex(node)
                neighbours = vertex.out_neighbours() if outgoing else vertex.in_neighbours()
                for neigh in neighbours:
                    if outgoing:
                        _g, _vertexdict = addedge(_g, node, neigh, _vertexdict)
                    else:
                        _g, _vertexdict = addedge(_g, neigh, node, _vertexdict)
                    q.put((neigh, depth + 1))
            except KeyError:
                # NOTE(review): confirm KeyError is what mastergraph.vertex()
                # raises for an unknown node.
                print("{} degree node {} not in graph g".format(depth, node))
    return _g, _vertexdict
def buildlocalgraphundirected(rootnode, mastergraph, indepth = 0, outdepth = 2):
    """Copy the neighbourhood of rootnode in mastergraph into a new undirected graph.

    A breadth-first walk follows out_neighbours() for up to `outdepth` hops,
    then in_neighbours() for up to `indepth` hops, copying every traversed
    edge with addedge().

    Parameters:
        rootnode: vertex of mastergraph to start from (the calling cell passes
            an integer vertex index).
        mastergraph: graph whose neighbours are read.
        indepth, outdepth: number of hops for the in_neighbours and
            out_neighbours walks respectively.

    Returns:
        (graph, vertexdict): the new graph and the mapping from master-graph
        nodes to its vertex indices.
    """
    _g = gt.Graph(directed = False)
    _vertexdict = dict()
    for maxdepth, outgoing in ((outdepth, True), (indepth, False)):
        q = Queue.Queue()
        q.put((rootnode, 0))
        # Expand each node once per pass, at its smallest depth, so a node
        # reached along several paths does not re-add all of its edges.
        # NOTE(review): an edge between two expanded nodes is still added once
        # from each end when both fall inside the depth limit.
        expanded = set()
        while not q.empty():
            node, depth = q.get()
            if depth >= maxdepth or int(node) in expanded:
                continue
            expanded.add(int(node))
            try:
                vertex = mastergraph.vertex(node)
                neighbours = vertex.out_neighbours() if outgoing else vertex.in_neighbours()
                for neigh in neighbours:
                    if outgoing:
                        _g, _vertexdict = addedge(_g, node, neigh, _vertexdict)
                    else:
                        _g, _vertexdict = addedge(_g, neigh, node, _vertexdict)
                    q.put((neigh, depth + 1))
            except KeyError:
                # The message is now printed instead of being built and discarded.
                print("{} degree node {} not in graph g".format(depth, node))
    return _g, _vertexdict
In [33]:
#plots
#%matplotlib inline
#import matplotlib.pyplot as plt
#import numpy as np
In [3]:
#author = u'martinjshipley'
# Depth-1 co-author neighbourhood of one author, drawn inline with vertices
# filled according to degree.
author = u'emmanuellebouzigon'
minigraph3, vertexdict3 = buildlocalgraphundirected(author_vertex_dict[author], g, 1, 1)
deg = minigraph3.degree_property_map("out") #out AND in how?
gt.graph_draw(minigraph3, vertex_fill_color=deg, output_size=(1200,1200), inline=True)
Out[3]:
In [16]:
author_vertex_dict
Out[16]:
In [17]:
author = u'martinjshipley'
In [1]:
import graph_tool as gt
In [2]:
def addweightededge(graphobject, source, dest, vertexdict, weight):
    """Add or reinforce the edge between source and dest.

    Missing endpoints are created and recorded in vertexdict. If no edge
    exists between them yet, one is added with weight 1; otherwise the
    existing edge's weight is incremented.

    Parameters:
        graphobject: graph being built (needs add_vertex / edge / add_edge).
        source, dest: endpoint keys.
        vertexdict: maps endpoint keys to vertex indices in graphobject.
        weight: mapping from edge to integer weight, updated in place.

    Returns:
        (graphobject, vertexdict), both updated in place.
    """
    for key in (source, dest):
        if key not in vertexdict:
            vertexdict[key] = int(graphobject.add_vertex())
    src, dst = vertexdict[source], vertexdict[dest]
    # Look the edge up once and test identity against None; `== None` would
    # go through the edge object's own equality operator.
    edge = graphobject.edge(src, dst)
    if edge is None:
        edge = graphobject.add_edge(src, dst)
        weight[edge] = 1
    else:
        weight[edge] += 1
    return graphobject, vertexdict
In [1]:
import sqlite3
conn = sqlite3.connect('pmcv1-full.db')
c = conn.cursor()
In [ ]:
import itertools
# Build the full undirected co-authorship graph with edge weights counting how
# many times each pair of authors was linked.
g = gt.Graph(directed = False)
edge_weight = g.new_edge_property("int32_t")
author_vertex_dict = dict()
author_full_name_dict = dict()
c.execute('''SELECT pmid, fn, ln FROM authors''')
authors = c.fetchall()
cnt = 0
# Group rows paper by paper. The earlier loop appended each paper's first
# author to the PREVIOUS paper's list before flushing it, then dropped that
# author from their own paper, and never linked the authors of the last paper.
# groupby merges only adjacent rows, so (like the earlier loop) this relies on
# a paper's rows being returned contiguously.
for currpaper, paperrows in itertools.groupby(authors, key=lambda row: row[0]):
    authorspaper = []
    for entry in paperrows:
        # Author key: first name + last name, spaces removed, lower-cased.
        authorcat = unicode(entry[1]+entry[2]).replace(" ", "").lower()
        authorspaper.append(authorcat)
        author_full_name_dict[authorcat] = (entry[1],entry[2])
        cnt += 1
        if cnt % 50000 == 0:
            print(cnt) #total 5,745,410 lines of authors in db
    #add author nodes and edges
    for comb in itertools.combinations(authorspaper, 2):
        addweightededge(g, comb[0], comb[1], author_vertex_dict, edge_weight)
In [ ]:
import cPickle as pickle
# Persist the weighted co-authorship graph, both author lookups and the edge
# weights. Each file is written inside a `with` block so it is flushed and
# closed before the next one is opened.
for payload, filename in (
        (g, "authors_full_graph_with_weights.p"),
        (author_vertex_dict, "authors_vertex_dict_weights.p"),
        (author_full_name_dict, "authors_full_name_dict_weights.p"),
        (edge_weight, "authors_full_graph_weights.p")):
    with open(filename, "wb") as outfile:
        pickle.dump(payload, outfile)
once this is done on DO https://graph-tool.skewed.de/static/doc/flow.html#graph_tool.flow.min_cut
For now, working without weights.
In [1]:
import cPickle as pickle
# Load the (unweighted) pickled co-authorship graph and its two author lookups.
# Files are opened in `with` blocks so the handles are closed after reading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("authors_full_graph.p", "rb") as graph_file:
    g = pickle.load(graph_file)
with open("authors_vertex_dict.p", "rb") as vertex_file:
    author_vertex_dict = pickle.load(vertex_file)
with open("authors_full_name_dict.p", "rb") as name_file:
    author_full_name_dict = pickle.load(name_file)
In [30]:
# Inverse lookup: vertex index -> author key.
rev_author_vertex_dict = dict((vertex, name) for name, vertex in author_vertex_dict.items())
In [2]:
# shortest path
# https://graph-tool.skewed.de/static/doc/topology.html?highlight=shortest%20path#graph_tool.topology.shortest_path
In [3]:
author_full_name_dict
Out[3]:
In [7]:
author_vertex_dict[u'kennethfmanly']
author_vertex_dict[u'nancybaker']
Out[7]:
In [71]:
import graph_tool.topology as topology
In [73]:
# Shortest path in the co-authorship graph between two authors; the result is
# unpacked into vlist and elist (presumably the vertices and edges along the
# path - see the graph-tool shortest_path documentation).
vlist, elist = topology.shortest_path(g, g.vertex(author_vertex_dict[u'kennethfmanly']),
g.vertex(author_vertex_dict[u'zhihuitong']))
In [ ]:
#topology.absolute_import
In [72]:
# Disabled call. Both lines of it are commented out now; the second line used
# to be left uncommented, which made the cell a syntax error.
#topology.all_shortest_paths(g, g.vertex(author_vertex_dict[u'kennethfmanly']),
#                            g.vertex(author_vertex_dict[u'zhihuitong']))
In [36]:
for edge in elist:
test= edge
In [46]:
int(test.target())
Out[46]:
In [91]:
def addedge(graphobject, source, dest, vertexdict, v_label):
    """Add the edge source -> dest, creating and labelling missing vertices.

    Each newly created vertex is labelled with str(rev_pmid_vertex_dict[key])
    for its OWN key. The previous version labelled a new source vertex with
    the dest's entry.

    Parameters:
        graphobject: graph being built (needs add_vertex / add_edge).
        source, dest: endpoint keys, also used to index the notebook-level
            rev_pmid_vertex_dict.
        vertexdict: maps endpoint keys to vertex indices in graphobject.
        v_label: per-vertex label map, indexed by the new vertex.

    Returns:
        (graphobject, vertexdict, v_label), all updated in place.
    """
    for key in (source, dest):
        if key not in vertexdict:
            v = graphobject.add_vertex()
            vertexdict[key] = int(v)
            v_label[v] = str(rev_pmid_vertex_dict[key])
    graphobject.add_edge(vertexdict[source], vertexdict[dest])
    return graphobject, vertexdict, v_label
import graph_tool as gt
def buildauthorgraph(nodelist, edgelist, authordict, mastergraph):
    """Build a small undirected, labelled graph from a vertex list and an edge list.

    Every node in nodelist becomes a vertex labelled with a string derived
    from author_full_name_dict[authordict[node]]; every edge in edgelist is
    re-created between the corresponding new vertices. mastergraph is accepted
    but not used.

    Returns:
        (graph, vertexdict, v_label): the new graph, the node -> new vertex
        index mapping, and the string vertex property holding the labels.
    """
    _g = gt.Graph(directed = False)
    v_label = _g.new_vertex_property("string")
    _vertexdict = dict()
    #first add nodes
    for node in nodelist:
        v = _g.add_vertex()
        _vertexdict[node] = int(v)
        fullname = author_full_name_dict[authordict[node]]
        # Turn the repr of the name tuple into display text by stripping the
        # tuple and quote punctuation.
        v_label[v] = str(fullname).strip('()').strip("'").replace("u'", "").replace("',", "")
    #now add edges
    for edge in edgelist:
        src = _vertexdict[int(edge.source())]
        dst = _vertexdict[int(edge.target())]
        _g.add_edge(src, dst)
    return _g, _vertexdict, v_label
In [92]:
authg, authdict, authlab = buildauthorgraph(vlist, elist, rev_author_vertex_dict, g)
In [90]:
str(author_full_name_dict[rev_author_vertex_dict[11]]).strip('()').strip("'").replace("u'", "").replace("',", "")
Out[90]:
In [145]:
#see formatting at https://graph-tool.skewed.de/static/doc/draw.html
import graph_tool.all as gt
# Draw the author path graph inline, labelled with authlab and with vertices
# filled according to degree.
deg = authg.degree_property_map("out") #out AND in how?
gt.graph_draw(authg, vertex_fill_color=deg, vertex_text=authlab, output_size=(600,300),
vertex_text_position=3.14/4., vertex_size = 20, vertex_font_size = 20,
edge_pen_width = 6, inline=True)
Out[145]:
In [146]:
#all_shortest_paths not working on ubuntu either - may need debian
#http://main-discussion-list-for-the-graph-tool-project.982480.n3.nabble.com/Debian-package-and-boost-at-compile-time-td4026383i20.html
#https://graph-tool.skewed.de/static/doc/search_module.html#graph_tool.search.dijkstra_search
As seen: http://stackoverflow.com/questions/238724/visualizing-undirected-graph-thats-too-large-for-graphviz fig 7:http://www-personal.umich.edu/~mejn/papers/cnlspre.pdf
In [1]:
%reset -f
import cPickle as pickle
# Reload the pickled co-authorship graph and lookups into the freshly reset namespace.
# Files are opened in `with` blocks so the handles are closed after reading.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("authors_full_graph.p", "rb") as graph_file:
    g = pickle.load(graph_file)
with open("authors_vertex_dict.p", "rb") as vertex_file:
    author_vertex_dict = pickle.load(vertex_file)
with open("authors_full_name_dict.p", "rb") as name_file:
    author_full_name_dict = pickle.load(name_file)
# Inverse lookup: vertex index -> author key.
rev_author_vertex_dict = {v: k for k, v in author_vertex_dict.items()}
In [ ]: